--
-- Copyright (c) 2020 lalawue
--
-- This library is free software; you can redistribute it and/or modify it
-- under the terms of the MIT license. See LICENSE for details.
--
--[[
    Chinese Article Summarization with TextRank
    [1] https://zhuanlan.zhihu.com/p/55270310
    [2] https://www.letianbiji.com/machine-learning/page-rank.html
    [3] https://www.cnblogs.com/sandwichnlp/p/11596848.html
    [4] http://www.ruanyifeng.com/blog/2013/03/cosine_similarity.html
]]
require("base.scratch")
local LacCore = require("lib.thulac")

-- Application table: every step of the summarizer below is defined as a
-- method on it, and the script invokes those methods on App itself.
local App = {}
-- Point __index at App so that method lookups through App-as-metatable
-- resolve to the functions defined on this table.
App.__index = App

-- Print the one-line command-line usage text to stdout.
function App:printHelp()
    local usage = "Usage: luajit text_rank.lua 'WORD_VECTORS.TXT' 'SUMMARIZE_COUNT' 'INPUT_TEXT'"
    print(usage)
end

-- Log a printf-style message to stdout with the "[TextRank] " prefix.
--   fmt: string.format format string
--   ...: values substituted into fmt
function App:info(fmt, ...)
    local message = string.format(fmt, ...)
    print("[TextRank] " .. message)
end

-- Read a whole file in binary mode.
--   file_path: path of the file to read
-- Returns the file content as a string, or nil when file_path is not a
-- string or the file cannot be opened.
function App:readAllContent(file_path)
    if type(file_path) ~= "string" then
        return nil
    end
    local handle = io.open(file_path, "rb")
    if not handle then
        return nil
    end
    local data = handle:read("*a")
    handle:close()
    return data
end

-- Command-line entry point: print the top-ranked sentences of a text file.
--   word_vectors_text: path of the GloVe word vector text file
--   summarize_count:   number of sentences to print (number or numeric string);
--                      floored, raised to at least 1 and capped at the number
--                      of sentences found
--   input_text:        path of the text file to summarize
-- Prints the usage line and exits with status 1 when the input file cannot be
-- read or summarize_count is not a number.
function App:main(word_vectors_text, summarize_count, input_text)
    local content = self:readAllContent(input_text)
    -- tonumber() yields nil for a missing or non-numeric count, which would
    -- make math.max raise; `wanted ~= wanted` rejects NaN
    local wanted = tonumber(summarize_count)
    if content == nil or wanted == nil or wanted ~= wanted then
        self:printHelp()
        -- a usage error is a failure, same status as a failed vector load
        os.exit(1)
    end
    summarize_count = math.max(math.floor(wanted), 1)

    -- vectors generated by GloVe https://github.com/stanfordnlp/GloVe.git with training data from http://www.sogou.com/labs/
    local words_vector_tbl = self:loadGloveVectorWords(word_vectors_text)

    -- separate sentences into paired { word, tag } bundles with http://thulac.thunlp.org/, modified version https://github.com/lalawue/THULAC
    local sentence_tbl = self:seperateSentenceFrom(content)

    -- generate each sentence vector by averaging the vectors of its words
    local sentence_vector_tbl = self:getSentenceVectorFromWords(sentence_tbl, words_vector_tbl)

    -- calculate the pairwise sentence similarity matrix
    local sentence_similarity_tbl = self:sentenceSimilarityMatrix(sentence_tbl, sentence_vector_tbl)

    -- score sentences from their similarities with the page rank formula
    local sentence_tr_tbl = self:textRank(sentence_similarity_tbl)

    -- output result: sentence_tr_tbl is sorted by ascending score, so the best
    -- sentences are read from the end of the array
    local count = #sentence_tr_tbl
    -- never ask for more sentences than were ranked; sentence_tr_tbl[i] would
    -- be nil for i < 1
    summarize_count = math.min(summarize_count, count)
    self:info("summarize with sentence_count %d", summarize_count)
    for i = count, count - summarize_count + 1, -1 do
        local tr = sentence_tr_tbl[i]
        print(tostring(count - i + 1) .. ")\t" .. self:concatSentenceLine(sentence_tbl[tr.index]))
    end
end

-- Load GloVe word vectors from a text file with one "word w1 w2 ... wN" entry
-- per line.
--   filename: path of the vector file
-- Returns a table mapping each word to an array of N numbers. N is taken from
-- the first usable line and stored in self._vector_dimension; every other
-- line must have the same N.
-- Exits with status 1 when the file cannot be read or holds no vector line,
-- and raises an error on a non-numeric weight or an inconsistent dimension.
function App:loadGloveVectorWords(filename)
    self:info("begin load Glove Vector Words from '%s'", filename)
    -- prepare vector words
    local content = self:readAllContent(filename)
    if content == nil then
        print("Failed to load Glove :", filename)
        os.exit(1)
    end
    local tbl = {}
    -- vector dimension, fixed by the first usable line
    local vector_dimension = nil
    local line_tbl = content:split("\n")
    for _, line in ipairs(line_tbl) do
        local t = line:split(" ")
        if Lang.isTable(t) and #t > 1 then
            local key = t[1]
            -- convert the weights to numbers once here instead of relying on
            -- string-to-number coercion in every later addition
            local vector = {}
            for i = 2, #t do
                local weight = tonumber(t[i])
                if weight == nil then
                    error(string.format("invalid vector weight '%s' for word '%s'", tostring(t[i]), tostring(key)))
                end
                vector[i - 1] = weight
            end
            if vector_dimension == nil then
                vector_dimension = #vector
            end
            assert(
                #vector == vector_dimension,
                string.format("inconsistency vector dimention %d, %d", #vector, vector_dimension)
            )
            tbl[key] = vector
        end
    end
    -- without this check a file with no vector line reaches the "%d" format
    -- below with a nil dimension and raises an unrelated format error
    if vector_dimension == nil then
        print("No word vector found in :", filename)
        os.exit(1)
    end
    self._vector_dimension = vector_dimension
    self:info("end generate prepared Glove Vector Words with total line:%d dimention:%d", #line_tbl, vector_dimension)
    return tbl
end

-- Segment the text with THULAC and group the resulting { word, tag } pairs
-- into sentences.
--   content: the text to split
-- A token tagged "w" (presumably THULAC's punctuation tag - confirm against
-- the tag set) that is one of the terminators closes the current sentence,
-- unless it appears after an opening quote and before the closing one. The
-- terminator itself is not stored; every other token, quotes included, is.
-- Returns an array of sentences, each an array of { word, tag } pairs, and
-- raises an error when no sentence was found.
function App:seperateSentenceFrom(content)
    local terminator_tbl = {["？"] = true, ["。"] = true, ["\n"] = true}
    -- the opening quote maps to itself, so comparing the stored value with
    -- the word tells an opening quote from a closing one
    local quote_tbl = {["“"] = "“", ["”"] = true}

    -- `current` always refers to the last entry of sentence_tbl
    local current = {}
    local sentence_tbl = {current}
    local inside_quote = false

    local lac = LacCore.newLac("lib/models", "data/user_words.txt", 0, 0, 0)
    local total = lac:seg(content)
    for idx = 1, total do
        local word, tag = lac:fetch(idx)
        local is_mark = (tag == "w")
        if is_mark and quote_tbl[word] then
            inside_quote = (quote_tbl[word] == word)
        end
        if is_mark and terminator_tbl[word] and not inside_quote then
            -- open a new sentence only when the current one has content, so
            -- consecutive terminators do not produce empty sentences
            if #current > 0 then
                current = {}
                sentence_tbl[#sentence_tbl + 1] = current
            end
        else
            current[#current + 1] = {word, tag}
        end
    end
    lac:fini()

    -- drop the trailing sentence when the text ended right after a terminator
    if #current <= 0 then
        sentence_tbl[#sentence_tbl] = nil
    end

    self:info("seperate sentence count %d", #sentence_tbl)
    assert(#sentence_tbl > 0, "invalid sentence count")
    return sentence_tbl
end

-- Rebuild the text of a sentence by joining the word of every { word, tag }
-- pair, without any separator.
--   sentence_line_tbl: array of { word, tag } pairs
-- Returns the joined string.
function App:concatSentenceLine(sentence_line_tbl)
    local pieces = {}
    for _, pair in ipairs(sentence_line_tbl) do
        local word = pair[1]
        pieces[#pieces + 1] = word
    end
    return table.concat(pieces)
end

-- Tags whose words are left out when sentence vectors are built.
-- NOTE(review): these look like THULAC part-of-speech tags (e.g. "w" is used
-- elsewhere in this file for punctuation) - confirm against the THULAC tag set.
local stop_words_tbl = {}
for _, tag in ipairs({"e", "o", "u", "y", "w", "x", "m", "q", "mq"}) do
    stop_words_tbl[tag] = true
end
stop_words_tbl = table.readonly(stop_words_tbl)

-- Element-wise sum of two arrays over indexes 1..maxn.
--   t1, t2: arrays to add; either may be nil, and a missing table or a
--           missing element counts as 0
--   maxn:   number of elements in the result
-- Returns a new array; neither input is modified.
function App:tableAdd(t1, t2, maxn)
    local sum = {}
    for idx = 1, maxn do
        local left = t1 and t1[idx] or 0
        local right = t2 and t2[idx] or 0
        sum[idx] = left + right
    end
    return sum
end

-- Map a function over indexes 1..maxn of an array.
--   t1:   source array; a missing element is passed to func as 0
--   func: called with one element, its first return value is stored
--   maxn: number of elements in the result
-- Returns a new array; t1 is not modified.
function App:tableApply(t1, func, maxn)
    local mapped = {}
    for idx = 1, maxn do
        local input = t1[idx] or 0
        mapped[idx] = func(input)
    end
    return mapped
end

-- Build one vector per sentence by averaging the vectors of its words.
--   sentence_tbl:     array of sentences, each an array of { word, tag } pairs
--   words_vector_tbl: table mapping a word to its vector
-- Words whose tag is in stop_words_tbl are skipped. A word without an entry
-- in words_vector_tbl adds nothing to the sum but still counts in the
-- divisor. A sentence with no counted word gets no entry in the result.
-- Returns a table indexed like sentence_tbl.
function App:getSentenceVectorFromWords(sentence_tbl, words_vector_tbl)
    local dimension = self._vector_dimension
    local sentence_vector_tbl = {}
    for index, sentence in ipairs(sentence_tbl) do
        -- running sum of the word vectors and the number of words counted
        local sum = nil
        local used = 0
        for _, pair in ipairs(sentence) do
            local word, tag = pair[1], pair[2]
            if not stop_words_tbl[tag] then
                sum = self:tableAdd(sum, words_vector_tbl[word], dimension)
                used = used + 1
            end
        end
        -- divide the sum by the number of counted words
        if sum then
            local function average(v)
                return v / used
            end
            sentence_vector_tbl[index] = self:tableApply(sum, average, dimension)
        end
    end
    self:info("average sentence vector from containing verds vector")
    return sentence_vector_tbl
end

-- Cosine similarity of two arrays over indexes 1..maxn.
--   t1, t2: arrays to compare; a nil table or a missing element counts as 0
--   maxn:   number of components to take into account
-- Returns dot(t1, t2) / (|t1| * |t2|), or 0 when either norm is 0.
function App:tableCosine(t1, t2, maxn)
    local dot, norm1_sq, norm2_sq = 0, 0, 0
    for idx = 1, maxn do
        local a = t1 and t1[idx] or 0
        local b = t2 and t2[idx] or 0
        dot = dot + a * b
        norm1_sq = norm1_sq + a * a
        norm2_sq = norm2_sq + b * b
    end
    local scale = math.sqrt(norm1_sq) * math.sqrt(norm2_sq)
    if scale > 0 then
        return dot / scale
    end
    return 0
end

-- Build the symmetric matrix of pairwise cosine similarities between
-- sentence vectors.
--   sentence_tbl:        array of sentences; only its length is used
--   sentence_vector_tbl: sentence vectors indexed like sentence_tbl; an entry
--                        may be nil and is then treated as a zero vector
-- Returns an N x N array of arrays (N = #sentence_tbl). The diagonal and
-- every pair whose similarity is not above 0.1 hold 0.
function App:sentenceSimilarityMatrix(sentence_tbl, sentence_vector_tbl)
    local sentence_count = #sentence_tbl

    -- The cosine must run over the vector components, not over the number of
    -- sentences: with fewer sentences than dimensions only a prefix of each
    -- vector would be compared.
    local dimension = self._vector_dimension
    if dimension == nil then
        -- no vector file was loaded through this object; fall back to the
        -- longest vector that was passed in
        dimension = 0
        for i = 1, sentence_count do
            local vector = sentence_vector_tbl[i]
            if vector and #vector > dimension then
                dimension = #vector
            end
        end
    end

    -- start from an all-zero matrix
    local sentence_similarity_tbl = {}
    for i = 1, sentence_count do
        sentence_similarity_tbl[i] = self:tableAdd(nil, nil, sentence_count)
    end

    -- fill both triangles from each pair (i, j) with i < j
    for i = 1, sentence_count do
        for j = i + 1, sentence_count do
            local value = self:tableCosine(sentence_vector_tbl[i], sentence_vector_tbl[j], dimension)
            -- pairs at or below the threshold keep their initial 0
            if value > 0.1 then
                sentence_similarity_tbl[i][j] = value
                sentence_similarity_tbl[j][i] = value
            end
        end
    end
    self:info("calculate sentence similarity matrix")
    return sentence_similarity_tbl
end

-- Score sentences with the weighted TextRank (PageRank) iteration.
--
-- Sentence i is linked to sentence j when sentence_similarity_tbl[i][j] is
-- above the edge threshold. Each round computes, for every linked sentence,
--     score(i) = (1 - d) + d * sum_j( w(j, i) / out(j) * score(j) )
-- where w is the link weight and out(j) the total weight of the links of j.
-- Dividing by out(j) is what makes scores differ: simply averaging the
-- neighbour scores leaves every linked sentence at exactly (1 - d) + d * 1.
-- A sentence without any link keeps the score 0.
--
--   sentence_similarity_tbl: N x N array of arrays of similarities
--   options (optional table):
--     damping        d in the formula, default 0.85
--     edge_threshold minimum similarity (exclusive) for a link, default 0.98
--     tolerance      stop when no score moved by this much, default 1e-6
--     max_rounds     upper bound on the number of rounds, default 200
-- Returns an array of { index = <sentence index>, value = <score> } sorted by
-- ascending score; among equal scores the earlier sentence is placed later,
-- so reading the array from the end yields the best sentences first.
function App:textRank(sentence_similarity_tbl, options)
    options = options or {}
    local damping = options.damping or 0.85
    local edge_threshold = options.edge_threshold or 0.98
    local tolerance = options.tolerance or 1e-6
    local max_rounds = options.max_rounds or 200

    local dimention = #sentence_similarity_tbl
    self:info("begin text rank with dimention %d", dimention)

    -- weight of the link from one sentence to another, 0 when not linked
    local function edgeWeight(from, to)
        if from == to then
            return 0
        end
        local weight = sentence_similarity_tbl[from][to]
        if weight > edge_threshold then
            return weight
        end
        return 0
    end

    -- total link weight of every sentence, and the initial scores
    local out_weight = {}
    local sentence_tr_tbl = {}
    for i = 1, dimention do
        local total = 0
        for j = 1, dimention do
            total = total + edgeWeight(i, j)
        end
        out_weight[i] = total
        sentence_tr_tbl[i] = {index = i, value = (total > 0) and 1 or 0}
    end

    for _ = 1, max_rounds do
        -- compute the whole round from the previous scores, so the result
        -- does not depend on the order in which sentences are visited
        local next_value = {}
        local max_delta = 0
        for i = 1, dimention do
            local value = 0
            if out_weight[i] > 0 then
                local total = 0
                for j = 1, dimention do
                    local weight = edgeWeight(j, i)
                    -- weight > 0 implies out_weight[j] > 0, as that weight is
                    -- part of the total of j
                    if weight > 0 then
                        total = total + weight / out_weight[j] * sentence_tr_tbl[j].value
                    end
                end
                value = (1 - damping) + damping * total
            end
            next_value[i] = value
            max_delta = math.max(max_delta, math.abs(value - sentence_tr_tbl[i].value))
        end
        for i = 1, dimention do
            sentence_tr_tbl[i].value = next_value[i]
        end
        -- a tolerance instead of exact equality guarantees termination
        if max_delta < tolerance then
            break
        end
    end

    self:info("sort text rank result")
    table.sort(
        sentence_tr_tbl,
        function(v1, v2)
            if v1.value ~= v2.value then
                return v1.value < v2.value
            end
            -- table.sort is not stable: break ties by position so that the
            -- output is deterministic
            return v1.index > v2.index
        end
    )
    return sentence_tr_tbl
end

-- Placeholder with an empty body: it takes no arguments, does nothing and
-- returns nothing. It is not called anywhere in the visible code.
function App:oneRoundPageRank()
end

-- Script entry point: `...` holds the arguments given to this chunk (the
-- command-line arguments when run as a script), forwarded as
-- word_vectors_text, summarize_count, input_text.
App:main(...)
